import re
from urllib.request import urlopen

"""1、用 Python 登录网页 """
# decode() 成可以正常显示中文的形式
html = urlopen(
    "https://morvanzhou.github.io/static/scraping/basic-structure.html"
).read().decode('utf-8')
print(html) # 查看网页源代码
"""2、匹配网页内容 """
res_01 = re.findall(r"<title>(.+?)</title>",html)
print("\npage title is:",res_01[0])
res_02 = re.findall(r"<p>(.+?)</p>",html,flags=re.DOTALL)
print("page paragraph is:",res_02[0])
# 找一找所有的链接
res_03 = re.findall(r'href="(.*?)"',html)
print("\nall links:",res_03)